{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 👾Qwen2大模型微调入门\n",
    "\n",
    "作者：林泽毅\n",
    "\n",
    "教程文章：https://zhuanlan.zhihu.com/p/702491999  \n",
    "\n",
    "显存要求：10GB左右  \n",
    "\n",
    "实验过程看：https://swanlab.cn/@ZeyiLin/Qwen2-fintune/runs/cfg5f8dzkp6vouxzaxlx6/chart"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.安装环境\n",
    "\n",
    "本案例测试于modelscope==1.14.0、transformers==4.41.2、datasets==2.18.0、peft==0.11.1、accelerate==0.30.1、swanlab==0.3.9"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install torch swanlab modelscope transformers datasets peft pandas accelerate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "如果是第一次使用SwanLab，则前往[SwanLab](https://swanlab.cn)注册账号后，在[用户设置](https://swanlab.cn/settings/overview)复制API Key，如果执行下面的代码："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: You are already logged in. Use `\u001b[1mswanlab login --relogin\u001b[0m` to force relogin.\n"
     ]
    }
   ],
   "source": [
    "!swanlab login"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. 数据集加载\n",
    "\n",
    "1. 在[zh_cls_fudan-news - modelscope](https://modelscope.cn/datasets/huangjintao/zh_cls_fudan-news/files)下载train.jsonl和test.jsonl到同级目录下。\n",
    "\n",
    "<img src=\"../assets/dataset.png\" width=600>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2. 将train.jsonl和test.jsonl进行处理，转换成new_train.jsonl和new_test.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2.将train.jsonl和test.jsonl进行处理，转换成new_train.jsonl和new_test.jsonl\n",
    "\n",
    "import json\n",
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "def dataset_jsonl_transfer(origin_path, new_path):\n",
    "    \"\"\"\n",
    "    将原始数据集转换为大模型微调所需数据格式的新数据集\n",
    "    \"\"\"\n",
    "    messages = []\n",
    "\n",
    "    # 读取旧的JSONL文件\n",
    "    with open(origin_path, \"r\") as file:\n",
    "        for line in file:\n",
    "            # 解析每一行的json数据\n",
    "            data = json.loads(line)\n",
    "            context = data[\"text\"]\n",
    "            catagory = data[\"category\"]\n",
    "            label = data[\"output\"]\n",
    "            message = {\n",
    "                \"instruction\": \"你是一个文本分类领域的专家，你会接收到一段文本和几个潜在的分类选项，请输出文本内容的正确类型\",\n",
    "                \"input\": f\"文本:{context},类型选型:{catagory}\",\n",
    "                \"output\": label,\n",
    "            }\n",
    "            messages.append(message)\n",
    "\n",
    "    # 保存重构后的JSONL文件\n",
    "    with open(new_path, \"w\", encoding=\"utf-8\") as file:\n",
    "        for message in messages:\n",
    "            file.write(json.dumps(message, ensure_ascii=False) + \"\\n\")\n",
    "\n",
    "\n",
    "# 加载、处理数据集和测试集\n",
    "train_dataset_path = \"train.jsonl\"\n",
    "test_dataset_path = \"test.jsonl\"\n",
    "\n",
    "train_jsonl_new_path = \"new_train.jsonl\"\n",
    "test_jsonl_new_path = \"new_test.jsonl\"\n",
    "\n",
    "if not os.path.exists(train_jsonl_new_path):\n",
    "    dataset_jsonl_transfer(train_dataset_path, train_jsonl_new_path)\n",
    "if not os.path.exists(test_jsonl_new_path):\n",
    "    dataset_jsonl_transfer(test_dataset_path, test_jsonl_new_path)\n",
    "\n",
    "train_df = pd.read_json(train_jsonl_new_path, lines=True)[:1000]  # 取前1000条做训练（可选）\n",
    "test_df = pd.read_json(test_jsonl_new_path, lines=True)[:10]  # 取前10条做主观评测"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. 下载/加载模型和tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from modelscope import snapshot_download, AutoTokenizer\n",
    "from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq\n",
    "import torch\n",
    "\n",
    "# 在modelscope上下载Qwen模型到本地目录下\n",
    "model_dir = snapshot_download(\"qwen/Qwen2-1.5B-Instruct\", cache_dir=\"./\", revision=\"master\")\n",
    "\n",
    "# Transformers加载模型权重\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"./qwen/Qwen2-1___5B-Instruct/\", use_fast=False, trust_remote_code=True)\n",
    "model = AutoModelForCausalLM.from_pretrained(\"./qwen/Qwen2-1___5B-Instruct/\", device_map=\"auto\", torch_dtype=torch.bfloat16)\n",
    "model.enable_input_require_grads()  # 开启梯度检查点时，要执行该方法"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. 预处理训练数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_func(example):\n",
    "    \"\"\"\n",
    "    将数据集进行预处理\n",
    "    \"\"\"\n",
    "    MAX_LENGTH = 384\n",
    "    input_ids, attention_mask, labels = [], [], []\n",
    "    instruction = tokenizer(\n",
    "        f\"<|im_start|>system\\n你是一个文本分类领域的专家，你会接收到一段文本和几个潜在的分类选项，请输出文本内容的正确类型<|im_end|>\\n<|im_start|>user\\n{example['input']}<|im_end|>\\n<|im_start|>assistant\\n\",\n",
    "        add_special_tokens=False,\n",
    "    )\n",
    "    response = tokenizer(f\"{example['output']}\", add_special_tokens=False)\n",
    "    input_ids = (\n",
    "        instruction[\"input_ids\"] + response[\"input_ids\"] + [tokenizer.pad_token_id]\n",
    "    )\n",
    "    attention_mask = instruction[\"attention_mask\"] + response[\"attention_mask\"] + [1]\n",
    "    labels = (\n",
    "        [-100] * len(instruction[\"input_ids\"])\n",
    "        + response[\"input_ids\"]\n",
    "        + [tokenizer.pad_token_id]\n",
    "    )\n",
    "    if len(input_ids) > MAX_LENGTH:  # 做一个截断\n",
    "        input_ids = input_ids[:MAX_LENGTH]\n",
    "        attention_mask = attention_mask[:MAX_LENGTH]\n",
    "        labels = labels[:MAX_LENGTH]\n",
    "    return {\"input_ids\": input_ids, \"attention_mask\": attention_mask, \"labels\": labels}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "train_ds = Dataset.from_pandas(train_df)\n",
    "train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. 设置LORA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from peft import LoraConfig, TaskType, get_peft_model\n",
    "\n",
    "config = LoraConfig(\n",
    "    task_type=TaskType.CAUSAL_LM,\n",
    "    target_modules=[\n",
    "        \"q_proj\",\n",
    "        \"k_proj\",\n",
    "        \"v_proj\",\n",
    "        \"o_proj\",\n",
    "        \"gate_proj\",\n",
    "        \"up_proj\",\n",
    "        \"down_proj\",\n",
    "    ],\n",
    "    inference_mode=False,  # 训练模式\n",
    "    r=8,  # Lora 秩\n",
    "    lora_alpha=32,  # Lora alaph，具体作用参见 Lora 原理\n",
    "    lora_dropout=0.1,  # Dropout 比例\n",
    ")\n",
    "\n",
    "model = get_peft_model(model, config)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. 训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "args = TrainingArguments(\n",
    "    output_dir=\"./output/Qwen2\",\n",
    "    per_device_train_batch_size=4,\n",
    "    gradient_accumulation_steps=4,\n",
    "    logging_steps=10,\n",
    "    num_train_epochs=2,\n",
    "    save_steps=100,\n",
    "    learning_rate=1e-4,\n",
    "    save_on_each_node=True,\n",
    "    gradient_checkpointing=True,\n",
    "    report_to=\"none\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from swanlab.integration.huggingface import SwanLabCallback\n",
    "import swanlab\n",
    "\n",
    "swanlab_callback = SwanLabCallback(\n",
    "    project=\"Qwen2-fintune\",\n",
    "    experiment_name=\"Qwen2-1.5B-Instruct\",\n",
    "    description=\"使用通义千问Qwen2-1.5B-Instruct模型在zh_cls_fudan-news数据集上微调。\",\n",
    "    config={\n",
    "        \"model\": \"qwen/Qwen2-1.5B-Instruct\",\n",
    "        \"dataset\": \"huangjintao/zh_cls_fudan-news\",\n",
    "    },\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=args,\n",
    "    train_dataset=train_dataset,\n",
    "    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),\n",
    "    callbacks=[swanlab_callback],\n",
    ")\n",
    "\n",
    "trainer.train()\n",
    "\n",
    "\n",
    "# ====== 训练结束后的预测 ===== #\n",
    "\n",
    "def predict(messages, model, tokenizer):\n",
    "    device = \"cuda\"\n",
    "    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
    "    model_inputs = tokenizer([text], return_tensors=\"pt\").to(device)\n",
    "    generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)\n",
    "    generated_ids = [\n",
    "        output_ids[len(input_ids) :]\n",
    "        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)\n",
    "    ]\n",
    "\n",
    "    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
    "    print(response)\n",
    "\n",
    "    return response\n",
    "    \n",
    "\n",
    "test_text_list = []\n",
    "for index, row in test_df.iterrows():\n",
    "    instruction = row[\"instruction\"]\n",
    "    input_value = row[\"input\"]\n",
    "\n",
    "    messages = [\n",
    "        {\"role\": \"system\", \"content\": f\"{instruction}\"},\n",
    "        {\"role\": \"user\", \"content\": f\"{input_value}\"},\n",
    "    ]\n",
    "\n",
    "    response = predict(messages, model, tokenizer)\n",
    "    messages.append({\"role\": \"assistant\", \"content\": f\"{response}\"})\n",
    "    result_text = f\"{messages[0]}\\n\\n{messages[1]}\\n\\n{messages[2]}\"\n",
    "    test_text_list.append(swanlab.Text(result_text, caption=response))\n",
    "\n",
    "swanlab.log({\"Prediction\": test_text_list})\n",
    "swanlab.finish()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
